suppressMessages({
library(stringr)
library(tidyverse)
library(gdata)
library(base)
library(scales)
library(RColorBrewer)
library(igraph)
library(qgraph)
library(ggpubr)
library(ggrepel)
library(ggpattern)
library(purrr)
library(ergm)
library(intergraph)
library(netplot)
library(broom)
library(pander)
library(gtools)
})
Cleaned dataset Format
from, text, directed, to1. Open csv dataset and remove blank spaces
# Open csv file and add n_words column
load_and_n_words <- function(movie_name){
df_name <- paste0('directed-datasets/', movie_name, '-cl-directed.csv')
df <- read.csv(df_name, na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8")
for (i in 1:nrow(df)){
df$nwords[i] <- str_count(df$text[i], '\\w+')
}
return(df)
}
# Remove leading and trailing blanks in name and text
remove_empty_space2 <- function(df){
df$from <- trim(df$from)
df$text <- trim(df$text)
df$directed <- trim(df$directed)
df$to <- trim(df$to)
return(df)
}
2. Compute number of words
# 1) Create a df with number of words of ONLY DIRECTED dialogues
create_df_directed <- function(df){
df_directed <- df %>%
filter(df$directed == 1) %>%
group_by(from, to) %>%
summarize(weight = sum(nwords))
df_directed <- merge(df_directed, df_0_all_movies[,c("from", "to", "movie")], by = c("from", "to"), all.x = TRUE)
df_directed <- df_directed[!duplicated(df_directed), ]
row.names(df_directed) <- 1:nrow(df_directed)
return(df_directed)
}
# 2) Create a df with number of words of ALL dialogues
create_df_both <- function(df){
df_both <- df %>%
group_by(from) %>%
summarize(n_words_total = sum(nwords))
row.names(df_both) <- 1:nrow(df_both)
return(df_both)
}
3. Create igraph object
###라고 표시한 부분을 수정했음
create_igraph <- function(df_movie){
# Compute number of words
# for (1) directed dialogues and (2) both directed and undirected
my_df_dir <- df_movie %>%
create_df_directed()
my_df_both <- df_movie %>%
create_df_both()
# Create character matrix (to create adjacency matrix)
chars <- my_df_both$from
from <- c()
to <- c()
for (i in 1:length(chars)){
from <- append(from, rep(chars[i], times = length(chars)))
}
to <- append(to, rep(chars, times = length(chars)))
my_char_mat <- data.frame(from, to) # my char matrix
remove(chars, from, to) # remove unnecessary objects
## Merge n_words_dialogue to each directed dialogue
my_char_mat <- merge(my_char_mat,
my_df_dir[,c("from", "to", "weight")],
by = c("from", "to"),
all.x = TRUE)
## Remove duplicates
my_char_mat <- my_char_mat[!duplicated(my_char_mat), ]
## Merge gender to my_df_both
named_char <- read.csv('0-named-char.csv', na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8") %>%
rename(from = X.U.FEFF.from)
my_df_both <- merge(my_df_both, named_char[,c("from", "gender")], by = "from", all.x = TRUE)
# Create graph object
v <- unique(c(my_char_mat[,1],
my_char_mat[,2]))
v <- na.omit(v)
e <- na.omit(my_char_mat)
g <- graph.data.frame(e, vertices = v, directed = T)
# Add attributes
V(g)$n_words_total <- my_df_both$n_words_total
V(g)$gender <- my_df_both$gender
return(g)
} # end of function definition
3.1 Visualize character network
# 모든 영화를 한번에 plot 하기 - 얘만하면 됨
plot_char_net_rm_isolates <- function(my_network, my_movie){
# remove isolates
my_network <- delete_vertices(my_network, degree(my_network) == 0)
# layout
e <- get.edgelist(my_network, names=FALSE)
if (gorder(my_network) < 18){
my_layout <- qgraph.layout.fruchtermanreingold(e, vcount=vcount(my_network),
area=2.2*(vcount(my_network)^1.15), repulse.rad=(vcount(my_network)^1))
} else if (gorder(my_network) < 35){
my_layout <- qgraph.layout.fruchtermanreingold(e, vcount=vcount(my_network),
area=2.25*(vcount(my_network)^2.1), repulse.rad=(vcount(my_network)^1.5))
} else {
my_layout <- qgraph.layout.fruchtermanreingold(e, vcount=vcount(my_network),
area=3*(vcount(my_network)^2.1), repulse.rad=(vcount(my_network)^1.5))
}
# plot
plot.igraph(my_network,
vertex.size = rescale(V(my_network)$n_words_total, c(3, 25)),
vertex.color = adjustcolor(ifelse(V(my_network)$gender == 0, "#009E73",
ifelse(V(my_network)$gender == 1, "#E69F00", "#999999")),
alpha.f = .6),
vertex.label = ifelse(V(my_network)$name %in% named_char_major$from, V(my_network)$name, ""),
vertex.label.cex = rescale(V(my_network)$n_words_total, c(0.8, 1.6)),
vertex.label.family = "Cambria",
vertex.label.font = 2, # bold text
vertex.label.color = "black",
edge.width = rescale(E(my_network)$weight, c(0.05, 10)), ###
edge.arrow.size = 0.45,
edge.color = adjustcolor("azure3", alpha.f = .6),
edge.curved = 0.25,
layout = my_layout,
main = paste0("Character Network for <", my_movie, ">")
)
legend("topright", legend = c("Woman", "Man"),
inset=c(0.05, 0), cex = 0.9, pch = 19,
col = c("#009E73", "#E69F00"), title = "Gender"
)
}
4. Compute centrality
# Compute centrality
compute_centrality <- function(my_network, my_movie){
# In- and out-degree centrality
id <- degree(my_network, mode="in")
od <- degree(my_network, mode = "out")
# Betweenness
bet <- betweenness(my_network, weights = E(my_network)$weight)
# Closeness
clo <- closeness(my_network, weights = E(my_network)$weight)
# n_words_total
n_words_total <- V(my_network)$n_words_total
my_df <- data.frame(from=names(id), id=id, od=od, bet=bet, clo=clo, n_words = n_words_total, row.names=NULL)
my_df$movie <- my_movie
remove(id, od, bet, clo)
return(my_df)
}
xxxx-cl-directed.csv)# movie list in alphabetical order
movie_list <- c("aladdin19", "aladdin", "beauty17", "beauty", "brave", "cind", "frog", "frozen2", "frozen",
"incredibles1", "incredibles2", "inside", "mermaid2", "mermaid", "moana", "mulan", "poca", "raya",
"sleeping", "snow", "tangled", "toystory1", "toystory2", "toystory3", "toystory4")
for (i in 1:length(movie_list)){
tmp <- load_and_n_words(movie_list[i])
tmp$movie <- movie_list[i]
assign(paste0("df_3_", movie_list[i]), tmp)
}
remove(tmp)
# Merge all preprocessed data of all films
# movie list in release year order
movie_list <- c("snow", "cind", "sleeping", "mermaid", "mermaid2", "beauty", "beauty17", "aladdin", "aladdin19",
"poca", "mulan", "frog", "tangled", "brave", "frozen", "frozen2", "moana", "raya",
"toystory1", "toystory2", "toystory3", "toystory4",
"incredibles1", "incredibles2", "inside")
df_0_all_movies <- data.frame()
for (i in 1:length(movie_list)){
df_tmp <- load_and_n_words(movie_list[i])
df_tmp$movie <- movie_list[i]
df_0_all_movies <- rbind(df_0_all_movies, df_tmp) %>%
remove_empty_space2
}
remove(df_tmp)
# For each directed dialogue
df_1_dir <- create_df_directed(df_0_all_movies)
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
# For each character
df_1_both <- create_df_both(df_0_all_movies)
## Warning: Setting row names on a tibble is deprecated.
# Get all unique characters who interacted with another (either from or to)
all_char <- append(df_1_dir$from, df_1_dir$to) %>% unique()
df_1_char <- data.frame(from = all_char)
df_1_char <- merge(df_1_char, df_1_dir[,c("from", "movie")], by = "from") %>% unique() %>% arrange(movie)
# Save to csv
write.csv(df_1_char,"0_unique_char.csv", row.names = F, fileEncoding = 'utf8')
# 모든 영화를 한번에 graph 만들기
# Apply
movie_list <- c("snow", "cind", "sleeping", "mermaid", "mermaid2", "beauty", "beauty17", "aladdin", "aladdin19",
"poca", "mulan", "frog", "tangled", "brave", "frozen", "frozen2", "moana", "raya",
"toystory1", "toystory2", "toystory3", "toystory4",
"incredibles1", "incredibles2", "inside", "nemo")
for (i in 1:length(movie_list)){
tmp <- load_and_n_words(movie_list[i])
tmp$movie <- movie_list[i]
my_net <- create_igraph(tmp)
assign(paste0("net_", movie_list[i]), my_net)
remove(tmp, my_net)
}
픽사 영화들은 노드 간 거리를 좀 두자
named_char_major <- read.csv('0-named-char-major2.csv', na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8") %>%
rename(from = X.U.FEFF.from)
movie_list <- c("snow", "cind", "sleeping", "mermaid", "mermaid2", "beauty", "beauty17", "aladdin", "aladdin19",
"poca", "mulan", "frog", "tangled", "brave", "frozen", "frozen2", "moana", "raya",
"toystory1", "toystory2", "toystory3", "toystory4",
"incredibles1", "incredibles2", "inside", "nemo")
movie_name_list <- c("Snow White and the Seven Dwarfs", "Cinderella", "Sleeping Beauty", "Litte Mermaid", "Little Mermaid 2",
"Beauty and the Beast", "Beauty and the Beast (2017)", "Aladdin", "Aladdin (2019)",
"Pocahontas", "Mulan", "Princess and the Frog", "Tangled", "Brave", "Frozen", "Frozen 2", "Moana", "Raya and the Last Dragon",
"Toy Story 1", "Toy Story 2", "Toy Story 3", "Toy Story 4",
"Incredibles", "Incredibles 2", "Inside Out", "Finding Nemo")
plot_list = list()
for (i in 1:length(movie_list)){
# load each movie
tmp <- load_and_n_words(movie_list[i])
tmp$movie <- movie_list[i]
# create network
my_net <- create_igraph(tmp)
# save network object
assign(paste0("net_", movie_list[i]), my_net)
# plot
my_plot <- plot_char_net_rm_isolates(my_net, movie_name_list[i])
my_plot
# save plots to a list
plot_list[[i]] <- my_plot
remove(tmp, my_net, my_plot)
}
# Apply
movie_list <- c("snow", "cind", "sleeping", "mermaid", "mermaid2", "beauty", "beauty17", "aladdin", "aladdin19",
"poca", "mulan", "frog", "tangled", "brave", "frozen", "frozen2", "moana", "raya",
"toystory1", "toystory2", "toystory3", "toystory4",
"incredibles1", "incredibles2", "inside", "nemo")
year_list <- c(1937, 1950, 1959, 1989, 2000, 1991, 2017, 1992, 2019,
1995, 1998, 2009, 2010, 2012, 2013, 2019, 2016, 2021,
1995, 1999, 2010, 2019,
2004, 2018, 2015, 2003)
df_2_centrality <- data.frame()
for (i in 1:length(movie_list)){
tmp <- load_and_n_words(movie_list[i])
tmp$movie <- movie_list[i]
my_net <- create_igraph(tmp)
my_centrality <- compute_centrality(my_net, movie_list[i])
df_2_centrality <- rbind(df_2_centrality, my_centrality)
remove(tmp, my_net)
}
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'from'. You can override using the `.groups` argument.
# Get character names
named_char <- read.csv('0-named-char.csv', na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8") %>%
rename(from = X.U.FEFF.from)
char_names <- named_char$from
# Get years
year <- read.csv('0-year.csv', na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8") %>%
rename(movie = X.U.FEFF.movie)
# Merge char names and year
df_2_centrality <- merge(df_2_centrality, named_char[,c("from", "gender")], by = "from", all.x = TRUE)
for (i in 1:length(movie_list)){
df_2_centrality$year[df_2_centrality$movie == movie_list[i]] <- year_list[i]
}
df_2_centrality$year <- as.factor(df_2_centrality$year)
remove(id, od, bet, clo)
Toy Story의 경우 line plot으로 각 캐릭터의 centrality가 1-4편으로 갈수록 어떻게 바뀌는지 봐야할듯
df_2_centrality 만드는 코드
# Number of words - PIXAR
df <- df_2_centrality %>%
drop_na(gender) %>%
group_by(movie) %>%
mutate(n_words_perc = round(n_words/sum(n_words)*100, 2)) %>%
arrange(year, from)
df_2_centrality_cl <- df
df$pixar <- ifelse(grepl("toystory", df$movie), "Pixar",
ifelse(grepl("incredibles", df$movie), "Pixar", "DisneyPrincess"))
여성 1인당 말한 정도 and 남성 1인당 말한 정도 (perc_per)
df_4_movie_name <- read.csv('0_movie_name.csv', na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8") %>%
rename(movie = X.U.FEFF.movie)
# n_words_total aggregated by gender
df_2_perc_words <- df_2_centrality_cl %>%
group_by(movie, gender) %>%
summarize(perc_subtotal = sum(n_words_perc),
pop_subtotal = n()) %>%
mutate(perc_per = perc_subtotal/pop_subtotal)
## `summarise()` has grouped output by 'movie'. You can override using the `.groups` argument.
df_2_perc_words <- merge(df_2_perc_words,
df_2_centrality_cl[,c("movie", "year", "gender")],
by = c("movie", "gender"),
all.x = TRUE) %>% unique()
df_2_perc_words <- merge(df_2_perc_words,
df_4_movie_name[,c("movie", "company", "movie_name")],
by = c("movie"),
all.x = TRUE)
df <- df_2_perc_words
# df
df_tmp <- df %>% filter(gender == 0)
# Subtotal for all women
ggplot(df,
aes(x=reorder(movie_name, as.numeric(year)), y=perc_subtotal, group = as.factor(gender)), label = df_tmp$movie_name) +
geom_line(aes(color=as.factor(gender)), cex = 0.45)+
geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +
# scale_color_discrete(name = "Gender", labels = c("Woman", "Man")) +
scale_color_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +
scale_shape_manual(values=c(16, 3), name = "Company") +
ylab("% of number of words spoken") +
xlab("Film") +
ggtitle("How much do women characters talk?") +
theme_minimal() +
theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
plot.title = element_text(size=14, face="bold"))
remove(df_tmp)
# Per person for each gender
ggplot(df, aes(x=reorder(movie_name, as.numeric(year)), y=perc_per, group = as.factor(gender)), label = df$movie_name) +
geom_line(aes(color=as.factor(gender)), cex = 0.45)+
geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +
# scale_color_discrete(name = "Gender", labels = c("Woman", "Man")) +
scale_color_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +
scale_shape_manual(values=c(16, 3), name = "Company", labels = c("Disney Princess", "Pixar")) +
ylab("% of number of words spoken (per person)") +
xlab("Film") +
ggtitle("How much do each gender talk per one person?") +
theme_minimal() +
theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
plot.title = element_text(size=14, face="bold"))
df <- df_2_perc_words
df_tmp <- df %>% filter(gender == 1) %>%
mutate(men_talk = perc_subtotal)
df <- merge(df, df_tmp[,c("movie_name", "men_talk")], by = "movie_name", all.x = TRUE)
df
## movie_name movie gender perc_subtotal
## 1 Aladdin aladdin 0 10.41
## 2 Aladdin aladdin 1 89.58
## 3 Aladdin (2019) aladdin19 0 18.78
## 4 Aladdin (2019) aladdin19 1 81.24
## 5 Beauty and the Beast beauty 0 31.61
## 6 Beauty and the Beast beauty 1 68.41
## 7 Beauty and the Beast (2017) beauty17 0 31.40
## 8 Beauty and the Beast (2017) beauty17 1 68.59
## 9 Brave brave 0 73.87
## 10 Brave brave 1 26.13
## 11 Cinderella cind 0 61.56
## 12 Cinderella cind 1 38.45
## 13 Finding Nemo nemo 0 23.73
## 14 Finding Nemo nemo 1 76.26
## 15 Frozen frozen 0 49.03
## 16 Frozen frozen 1 50.97
## 17 Frozen 2 frozen2 0 54.58
## 18 Frozen 2 frozen2 1 45.43
## 19 Incredibles 1 incredibles1 0 46.39
## 20 Incredibles 1 incredibles1 1 53.60
## 21 Incredibles 2 incredibles2 0 46.84
## 22 Incredibles 2 incredibles2 1 53.14
## 23 Inside Out inside 0 64.23
## 24 Inside Out inside 1 35.78
## 25 Little Mermaid mermaid 0 34.93
## 26 Little Mermaid mermaid 1 65.09
## 27 Little Mermaid 2 mermaid2 0 54.09
## 28 Little Mermaid 2 mermaid2 1 45.92
## 29 Moana moana 0 50.36
## 30 Moana moana 1 49.65
## 31 Mulan mulan 0 26.02
## 32 Mulan mulan 1 73.98
## 33 Pocahontas poca 0 31.94
## 34 Pocahontas poca 1 68.05
## 35 Raya and the Last Dragon raya 0 77.60
## 36 Raya and the Last Dragon raya 1 22.39
## 37 Sleeping Beauty sleeping 0 74.69
## 38 Sleeping Beauty sleeping 1 25.32
## 39 Snow White and the Seven Dwarfs snow 0 47.41
## 40 Snow White and the Seven Dwarfs snow 1 52.59
## 41 Tangled tangled 0 54.38
## 42 Tangled tangled 1 45.62
## 43 The Princess and the Frog frog 0 41.47
## 44 The Princess and the Frog frog 1 58.54
## 45 Toy Story 1 toystory1 0 8.61
## 46 Toy Story 1 toystory1 1 91.41
## 47 Toy Story 2 toystory2 0 12.43
## 48 Toy Story 2 toystory2 1 87.57
## 49 Toy Story 3 toystory3 0 22.47
## 50 Toy Story 3 toystory3 1 77.50
## 51 Toy Story 4 toystory4 0 40.93
## 52 Toy Story 4 toystory4 1 59.06
## pop_subtotal perc_per year company men_talk
## 1 3 3.470000 1992 DisneyPrincess 89.58
## 2 19 4.714737 1992 DisneyPrincess 89.58
## 3 6 3.130000 2019 DisneyPrincess 81.24
## 4 15 5.416000 2019 DisneyPrincess 81.24
## 5 7 4.515714 1991 DisneyPrincess 68.41
## 6 16 4.275625 1991 DisneyPrincess 68.41
## 7 14 2.242857 2017 DisneyPrincess 68.59
## 8 18 3.810556 2017 DisneyPrincess 68.59
## 9 6 12.311667 2012 DisneyPrincess 26.13
## 10 8 3.266250 2012 DisneyPrincess 26.13
## 11 6 10.260000 1950 DisneyPrincess 38.45
## 12 7 5.492857 1950 DisneyPrincess 38.45
## 13 7 3.390000 2003 Pixar 76.26
## 14 20 3.813000 2003 Pixar 76.26
## 15 5 9.806000 2013 DisneyPrincess 50.97
## 16 22 2.316818 2013 DisneyPrincess 50.97
## 17 7 7.797143 2019 DisneyPrincess 45.43
## 18 15 3.028667 2019 DisneyPrincess 45.43
## 19 13 3.568462 2004 Pixar 53.60
## 20 23 2.330435 2004 Pixar 53.60
## 21 8 5.855000 2018 Pixar 53.14
## 22 18 2.952222 2018 Pixar 53.14
## 23 8 8.028750 2015 Pixar 35.78
## 24 10 3.578000 2015 Pixar 35.78
## 25 8 4.366250 1989 DisneyPrincess 65.09
## 26 12 5.424167 1989 DisneyPrincess 65.09
## 27 8 6.761250 2000 DisneyPrincess 45.92
## 28 18 2.551111 2000 DisneyPrincess 45.92
## 29 5 10.072000 2016 DisneyPrincess 49.65
## 30 7 7.092857 2016 DisneyPrincess 49.65
## 31 9 2.891111 1998 DisneyPrincess 73.98
## 32 21 3.522857 1998 DisneyPrincess 73.98
## 33 3 10.646667 1995 DisneyPrincess 68.05
## 34 12 5.670833 1995 DisneyPrincess 68.05
## 35 11 7.054545 2021 DisneyPrincess 22.39
## 36 7 3.198571 2021 DisneyPrincess 22.39
## 37 5 14.938000 1959 DisneyPrincess 25.32
## 38 5 5.064000 1959 DisneyPrincess 25.32
## 39 2 23.705000 1937 DisneyPrincess 52.59
## 40 10 5.259000 1937 DisneyPrincess 52.59
## 41 2 27.190000 2010 DisneyPrincess 45.62
## 42 11 4.147273 2010 DisneyPrincess 45.62
## 43 5 8.294000 2009 DisneyPrincess 58.54
## 44 20 2.927000 2009 DisneyPrincess 58.54
## 45 4 2.152500 1995 Pixar 91.41
## 46 13 7.031538 1995 Pixar 91.41
## 47 10 1.243000 1999 Pixar 87.57
## 48 19 4.608947 1999 Pixar 87.57
## 49 12 1.872500 2010 Pixar 77.50
## 50 28 2.767857 2010 Pixar 77.50
## 51 26 1.574231 2019 Pixar 59.06
## 52 25 2.362400 2019 Pixar 59.06
# Subtotal for all women
ggplot(df,
aes(x=reorder(movie_name, as.numeric(men_talk)),
y=perc_subtotal,
fill = as.factor(gender),
pattern= company)) +
geom_bar_pattern(position = "fill", stat = "identity", width = 0.5,
color = "black",
pattern_fill = "darkgrey",
pattern_angle = 45,
pattern_density = 0.03,
pattern_spacing = 0.04,
pattern_key_scale_factor = 0.6,
pattern_alpha = 0.6,
alpha = 0.7) +
# geom_line(aes(color=as.factor(gender)), cex = 0.45)+
# geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +
# scale_fill_discrete(name = "Gender", labels = c("Woman", "Man")) +
scale_fill_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +
# scale_shape_manual(values=c(16, 3), name = "Company") +
scale_pattern_manual(values = c(DisneyPrincess = "none", Pixar = "stripe"), name = "Company",
labels = c("Disney Princess", "Pixar")) +
ylab("Percent of number of words spoken") +
scale_y_continuous(expand = c(0, 0)) +
xlab("Film") +
ggtitle("How much do women characters talk?") +
theme_minimal() +
theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
plot.title = element_text(size=14, face="bold")) +
coord_flip()
* https://stackoverflow.com/questions/22945651/remove-space-between-plotted-data-and-the-axes *
# centrality aggregated by gender
df_2_cent_agg <- df_2_centrality_cl %>%
group_by(movie, gender) %>%
summarize(id_subtotal = sum(id),
od_subtotal = sum(od),
bet_subtotal = sum(bet),
clo_subtotal = sum(clo),
pop_subtotal = n()) %>%
mutate(id_avg = id_subtotal/pop_subtotal,
od_avg = od_subtotal/pop_subtotal,
bet_avg = bet_subtotal/pop_subtotal,
clo_avg = clo_subtotal/pop_subtotal)
## `summarise()` has grouped output by 'movie'. You can override using the `.groups` argument.
df_2_cent_agg <- merge(df_2_cent_agg,
df_2_centrality_cl[,c("movie", "year", "gender")],
by = c("movie", "gender"),
all.x = TRUE) %>% unique()
df_2_cent_agg <- merge(df_2_cent_agg,
df_4_movie_name[,c("movie", "company", "movie_name")],
by = c("movie"),
all.x = TRUE)
df <- df_2_cent_agg
df
## movie gender id_subtotal od_subtotal bet_subtotal clo_subtotal
## 1 aladdin 0 10 9 92.0 0.001994976
## 2 aladdin 1 47 48 490.0 0.011986981
## 3 aladdin19 0 15 16 155.0 0.002771864
## 4 aladdin19 1 45 39 529.0 0.007879078
## 5 beauty 0 23 23 244.0 0.001767533
## 6 beauty 1 49 47 378.0 0.004263861
## 7 beauty17 0 35 35 716.5 0.004169734
## 8 beauty17 1 53 49 1139.5 0.005184970
## 9 brave 0 20 15 107.0 0.005017272
## 10 brave 1 15 18 83.0 0.009916703
## 11 cind 0 29 31 244.0 0.004760541
## 12 cind 1 19 12 111.0 0.004458738
## 13 frog 0 35 30 906.5 0.002058810
## 14 frog 1 46 47 778.0 0.007658809
## 15 frozen 0 31 21 525.5 0.001171757
## 16 frozen 1 49 49 1067.5 0.005063107
## 17 frozen2 0 27 28 347.0 0.008796414
## 18 frozen2 1 33 32 406.5 0.017508230
## 19 incredibles1 0 33 32 1128.0 0.002764981
## 20 incredibles1 1 61 58 2655.0 0.005063317
## 21 incredibles2 0 45 41 1033.0 0.002643835
## 22 incredibles2 1 61 57 1418.0 0.005820550
## 23 inside 0 34 26 402.0 0.002125551
## 24 inside 1 19 17 186.0 0.002562292
## 25 mermaid 0 18 17 197.0 0.006987713
## 26 mermaid 1 33 34 360.0 0.007231591
## 27 mermaid2 0 35 32 421.0 0.003025539
## 28 mermaid2 1 35 36 534.5 0.006583553
## 29 moana 0 15 14 102.0 0.004032799
## 30 moana 1 11 11 75.0 0.006343107
## 31 mulan 0 26 26 478.0 0.003427057
## 32 mulan 1 58 55 1101.0 0.008415391
## 33 nemo 0 23 22 394.0 0.001623869
## 34 nemo 1 96 81 2760.5 0.004837884
## 35 poca 0 10 11 55.0 0.003303143
## 36 poca 1 38 37 316.0 0.012786503
## 37 raya 0 31 29 364.0 0.008780085
## 38 raya 1 11 13 53.0 0.005895338
## 39 sleeping 0 20 23 126.0 0.014473174
## 40 sleeping 1 12 8 8.0 0.008620217
## 41 snow 0 13 13 102.0 0.003875765
## 42 snow 1 32 29 120.5 0.018322089
## 43 tangled 0 7 7 16.0 0.004206349
## 44 tangled 1 21 21 93.0 0.022173783
## 45 toystory1 0 6 9 15.0 0.001819299
## 46 toystory1 1 46 42 346.5 0.006604050
## 47 toystory2 0 13 16 296.0 0.002975672
## 48 toystory2 1 81 70 1374.0 0.005619041
## 49 toystory3 0 34 36 376.0 0.004044206
## 50 toystory3 1 100 98 2739.5 0.009741828
## 51 toystory4 0 74 64 2724.5 0.006052527
## 52 toystory4 1 77 82 2734.0 0.005991913
## pop_subtotal id_avg od_avg bet_avg clo_avg year company
## 1 3 3.333333 3.000000 30.666667 0.0006649919 1992 DisneyPrincess
## 2 19 2.473684 2.526316 25.789474 0.0006308937 1992 DisneyPrincess
## 3 6 2.500000 2.666667 25.833333 0.0004619773 2019 DisneyPrincess
## 4 15 3.000000 2.600000 35.266667 0.0005252719 2019 DisneyPrincess
## 5 7 3.285714 3.285714 34.857143 0.0002525047 1991 DisneyPrincess
## 6 16 3.062500 2.937500 23.625000 0.0002664913 1991 DisneyPrincess
## 7 14 2.500000 2.500000 51.178571 0.0002978382 2017 DisneyPrincess
## 8 18 2.944444 2.722222 63.305556 0.0002880539 2017 DisneyPrincess
## 9 6 3.333333 2.500000 17.833333 0.0008362120 2012 DisneyPrincess
## 10 8 1.875000 2.250000 10.375000 0.0012395879 2012 DisneyPrincess
## 11 6 4.833333 5.166667 40.666667 0.0007934234 1950 DisneyPrincess
## 12 7 2.714286 1.714286 15.857143 0.0006369625 1950 DisneyPrincess
## 13 5 7.000000 6.000000 181.300000 0.0004117621 2009 DisneyPrincess
## 14 20 2.300000 2.350000 38.900000 0.0003829405 2009 DisneyPrincess
## 15 5 6.200000 4.200000 105.100000 0.0002343515 2013 DisneyPrincess
## 16 22 2.227273 2.227273 48.522727 0.0002301412 2013 DisneyPrincess
## 17 7 3.857143 4.000000 49.571429 0.0012566305 2019 DisneyPrincess
## 18 15 2.200000 2.133333 27.100000 0.0011672154 2019 DisneyPrincess
## 19 13 2.538462 2.461538 86.769231 0.0002126909 2004 Pixar
## 20 23 2.652174 2.521739 115.434783 0.0002201442 2004 Pixar
## 21 8 5.625000 5.125000 129.125000 0.0003304794 2018 Pixar
## 22 18 3.388889 3.166667 78.777778 0.0003233639 2018 Pixar
## 23 8 4.250000 3.250000 50.250000 0.0002656939 2015 Pixar
## 24 10 1.900000 1.700000 18.600000 0.0002562292 2015 Pixar
## 25 8 2.250000 2.125000 24.625000 0.0008734641 1989 DisneyPrincess
## 26 12 2.750000 2.833333 30.000000 0.0006026326 1989 DisneyPrincess
## 27 8 4.375000 4.000000 52.625000 0.0003781924 2000 DisneyPrincess
## 28 18 1.944444 2.000000 29.694444 0.0003657530 2000 DisneyPrincess
## 29 5 3.000000 2.800000 20.400000 0.0008065599 2016 DisneyPrincess
## 30 7 1.571429 1.571429 10.714286 0.0009061581 2016 DisneyPrincess
## 31 9 2.888889 2.888889 53.111111 0.0003807842 1998 DisneyPrincess
## 32 21 2.761905 2.619048 52.428571 0.0004007329 1998 DisneyPrincess
## 33 7 3.285714 3.142857 56.285714 0.0002319813 2003 Pixar
## 34 20 4.800000 4.050000 138.025000 0.0002418942 2003 Pixar
## 35 3 3.333333 3.666667 18.333333 0.0011010478 1995 DisneyPrincess
## 36 12 3.166667 3.083333 26.333333 0.0010655419 1995 DisneyPrincess
## 37 11 2.818182 2.636364 33.090909 0.0007981896 2021 DisneyPrincess
## 38 7 1.571429 1.857143 7.571429 0.0008421911 2021 DisneyPrincess
## 39 5 4.000000 4.600000 25.200000 0.0028946347 1959 DisneyPrincess
## 40 5 2.400000 1.600000 1.600000 0.0017240434 1959 DisneyPrincess
## 41 2 6.500000 6.500000 51.000000 0.0019378827 1937 DisneyPrincess
## 42 10 3.200000 2.900000 12.050000 0.0018322089 1937 DisneyPrincess
## 43 2 3.500000 3.500000 8.000000 0.0021031746 2010 DisneyPrincess
## 44 11 1.909091 1.909091 8.454545 0.0020157984 2010 DisneyPrincess
## 45 4 1.500000 2.250000 3.750000 0.0004548246 1995 Pixar
## 46 13 3.538462 3.230769 26.653846 0.0005080038 1995 Pixar
## 47 10 1.300000 1.600000 29.600000 0.0002975672 1999 Pixar
## 48 19 4.263158 3.684211 72.315789 0.0002957390 1999 Pixar
## 49 12 2.833333 3.000000 31.333333 0.0003370172 2010 Pixar
## 50 28 3.571429 3.500000 97.839286 0.0003479224 2010 Pixar
## 51 26 2.846154 2.461538 104.788462 0.0002327895 2019 Pixar
## 52 25 3.080000 3.280000 109.360000 0.0002396765 2019 Pixar
## movie_name
## 1 Aladdin
## 2 Aladdin
## 3 Aladdin (2019)
## 4 Aladdin (2019)
## 5 Beauty and the Beast
## 6 Beauty and the Beast
## 7 Beauty and the Beast (2017)
## 8 Beauty and the Beast (2017)
## 9 Brave
## 10 Brave
## 11 Cinderella
## 12 Cinderella
## 13 The Princess and the Frog
## 14 The Princess and the Frog
## 15 Frozen
## 16 Frozen
## 17 Frozen 2
## 18 Frozen 2
## 19 Incredibles 1
## 20 Incredibles 1
## 21 Incredibles 2
## 22 Incredibles 2
## 23 Inside Out
## 24 Inside Out
## 25 Little Mermaid
## 26 Little Mermaid
## 27 Little Mermaid 2
## 28 Little Mermaid 2
## 29 Moana
## 30 Moana
## 31 Mulan
## 32 Mulan
## 33 Finding Nemo
## 34 Finding Nemo
## 35 Pocahontas
## 36 Pocahontas
## 37 Raya and the Last Dragon
## 38 Raya and the Last Dragon
## 39 Sleeping Beauty
## 40 Sleeping Beauty
## 41 Snow White and the Seven Dwarfs
## 42 Snow White and the Seven Dwarfs
## 43 Tangled
## 44 Tangled
## 45 Toy Story 1
## 46 Toy Story 1
## 47 Toy Story 2
## 48 Toy Story 2
## 49 Toy Story 3
## 50 Toy Story 3
## 51 Toy Story 4
## 52 Toy Story 4
# centrality measures
centrality_lineplot <- function(y, ylab, title, subtitle){
p <- ggplot(df,
aes(x=reorder(movie_name, as.numeric(year)), y=y, group = as.factor(gender)), label = df$movie_name) +
geom_line(aes(color=as.factor(gender)), cex = 0.45)+
geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +
scale_color_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +
# scale_color_discrete(name = "Gender", labels = c("Woman", "Man")) +
scale_shape_manual(values=c(16, 3), name = "Company") +
ylab("") +
xlab("Film") +
labs(title = title,
subtitle = subtitle) +
theme_minimal() +
theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
plot.title = element_text(size = 13, face="bold"))
return(p)
}
centrality_lineplot(df$id_subtotal, "In-degree centrality (subtotal)", "Are women spoken to more characters than men?", "Indegree centrality (subtotal)")
centrality_lineplot(df$id_avg, "In-degree centrality (average)", "Is one woman spoken to more characters than one man?", "Indegree centrality (average)")
centrality_lineplot(df$od_subtotal, "Out-degree centrality (subtotal)", "Do women speak to more characters than men?", "Outdegree centrality (subtotal)")
centrality_lineplot(df$od_avg, "Out-degree centrality (average)", "Does one woman speak to more characters than one man?", "Outdegree centrality (average)")
centrality_lineplot(df$bet_subtotal, "Betweenness centrality (subtotal)", "Are women acting like a bridge more than men?", "Betweenness centrality (subtotal)")
centrality_lineplot(df$bet_avg, "Betweenness centrality (average)", "Does one woman act like a bridge more than one man?", "Betweenness centrality (average)")
centrality_lineplot(df$clo_subtotal, "Closeness centrality (subtotal)", "Are women in the center of the network?", "Closeness centrality (subtotal)")
centrality_lineplot(df$clo_avg, "Closeness centrality (average)", "Is one woman in the center more than one man?", "Closeness centrality (average)")
ts <- c("toystory1", "toystory2", "toystory3", "toystory4")
inc <- c("incredibles1", "incredibles2")
mer <- c("mermaid", "mermaid2")
ala <- c("aladdin", "aladdin19")
bea <- c("beauty", "beauty17")
fro <- c("frozen", "frozen2")
seq <- c("toystory1", "toystory2", "toystory3", "toystory4", "incredibles1", "incredibles2",
"mermaid", "mermaid2", "aladdin", "aladdin19", "beauty", "beauty17", "frozen", "frozen2")
df_tmp <- df_2_perc_words %>%
filter(movie %in% seq) %>%
mutate(series = ifelse(movie %in% ts, "Toy Story",
ifelse(movie %in% inc, "Incredibles",
ifelse(movie %in% mer, "Little Mermaid",
ifelse(movie %in% ala, "Aladdin",
ifelse(movie %in% bea, "Beauty and the Beast", "Frozen"))))))
# Subtotal for all women
ggplot(df_tmp,
aes(x=movie_name, y=perc_subtotal, group = as.factor(gender)), label = df_tmp$movie_name) +
geom_line(aes(color=as.factor(gender)), cex = 0.45)+
geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +
# scale_color_discrete(name = "Gender", labels = c("Woman", "Man")) +
scale_color_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +
scale_shape_manual(values=c(16, 3), name = "Company") +
ylab("% of number of words spoken") +
xlab("Film") +
ggtitle("How much do women characters talk?") +
theme_minimal() +
theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
plot.title = element_text(size=14, face="bold"),
strip.text = element_blank()) +
facet_grid(. ~ series, scales = "free")
# Per person for each gender
ggplot(df_tmp,
aes(x=movie_name, y=perc_per, group = as.factor(gender)), label = df_tmp$movie_name) +
geom_line(aes(color=as.factor(gender)), cex = 0.45)+
geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +
# scale_color_discrete(name = "Gender", labels = c("Woman", "Man")) +
scale_color_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +
scale_shape_manual(values=c(16, 3), name = "Company") +
ylab("% of number of words spoken (per person)") +
xlab("Film") +
ggtitle("How much do each gender talk per one person?") +
theme_minimal() +
theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
plot.title = element_text(size=14, face="bold"),
strip.text = element_blank()) +
facet_grid(. ~ series, scales = "free")
remove(df_tmp)
centrality_lineplot_seq <- function(y, ylab, title, subtitle){
p <- ggplot(df,
aes(x=reorder(movie_name, as.numeric(year)), y=y, group = as.factor(gender)), label = df$movie_name) +
geom_line(aes(color=as.factor(gender)), cex = 0.45)+
geom_point(aes(color=as.factor(gender), shape = company), cex = 2.5) +
scale_color_manual(values = c("#009E73", "#E69F00"), name = "Gender", labels = c("Woman", "Man")) +
# scale_color_discrete(name = "Gender", labels = c("Woman", "Man")) +
scale_shape_manual(values=c(16, 3), name = "Company") +
ylab("") +
xlab("Film") +
labs(title = title,
subtitle = subtitle) +
theme_minimal() +
theme(axis.text.x = element_text(size = 8, angle = 90, vjust = 0.5, hjust=1),
plot.title = element_text(size = 13, face="bold")) +
facet_grid(. ~ series, scales = "free")
return(p)
}
# 1) Compute Degree Centrality (defined by Park et al., 2011) -> X
unique_char <- unique(df_1_dir$from)
tmp_out <- df_1_dir%>%
group_by(movie, from) %>%
summarize(out_weight = sum(weight)) %>%
rename(char_name = from)
## `summarise()` has grouped output by 'movie'. You can override using the `.groups` argument.
tmp_in <- df_1_dir %>%
group_by(movie, to) %>%
summarize(in_weight = sum(weight)) %>%
rename(char_name = to)
## `summarise()` has grouped output by 'movie'. You can override using the `.groups` argument.
tmp <- merge(tmp_out, tmp_in, by = c("char_name", "movie"), all.y = TRUE) %>%
na.omit()
tmp$boonja <- tmp$out_weight + tmp$in_weight
tmp$multiplied <- tmp$out_weight*tmp$in_weight
boonmo <- 2*sum(tmp$multiplied)
tmp$dc <- tmp$boonja/boonmo
df_4_dc <- tmp
remove(tmp, tmp_in, tmp_out)
files <- list.files(path = "4_ergm_models/")
df_5_ergm <- data.frame()
for (i in 1:length(files)) {
df_tmp = read.csv(paste0("4_ergm_models/", files[i]), header = T, sep = ",")
df_tmp = subset(df_tmp, select = -c(std.error, mcmc.error, statistic, p.value))
df_5_ergm <- rbind(df_5_ergm, df_tmp)
}
write.csv(df_5_ergm, file.path("4_ergm_models_edited/0_ergm_models.csv"))
# Apply
movie_list <- c("snow", "cind", "sleeping", "mermaid", "mermaid2", "beauty", "beauty17", "aladdin", "aladdin19",
"poca", "mulan", "frog", "tangled", "brave", "frozen", "frozen2", "moana", "raya",
"toystory1", "toystory2", "toystory3", "toystory4",
"incredibles1", "incredibles2", "inside", "nemo")
# rbind all films into one dataframe for each model
df_5_m1 <- data.frame()
df_5_m2 <- data.frame()
df_5_m3 <- data.frame()
df_5_m4 <- data.frame()
for (i in 1:4){
df_tmp1 <- data.frame()
for (j in 1:length(movie_list)){
df_tmp2 <- read.csv(paste0("4_ergm_models/m", i, "_", movie_list[j], ".csv"),
na.strings=NA, header = TRUE, sep = ",", encoding = "UTF-8")
df_tmp2$film <- ""
df_tmp2$film[1] <- movie_list[j]
df_tmp2 <- df_tmp2 %>%
select(film, everything())
df_tmp1 <- rbind(df_tmp1, df_tmp2)
}
if (i == 1){
df_5_m1 <- df_tmp1
}
if (i == 2){
df_5_m2 <- df_tmp1
}
if (i == 3){
df_5_m3 <- df_tmp1
}
if (i == 4){
df_5_m4 <- df_tmp1
}
}